The 10-armed testbed: learn by implementing!
Author: Rodrigo Chang
I was reading Sutton and Barto's book (2nd ed.) on Reinforcement Learning, section 2.3, where they explain the k-armed bandit problem, and I realized that it might not be so hard to implement an interactive version of Figure 2.2 from the book. I share my work here to show that sometimes you can implement something as you read, which is a great way to learn and practice the concepts you are studying.
My workflow:
I created the module Bandits and started writing functions, trying them out in the cells below.
When I was happy with the result, I added the interactivity and plots.
Have fun!
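The core of the implementation is the incremental sample-average update from the book (equation 2.3): after receiving reward R_n for an action that has been chosen n times, the estimate is updated as Q_{n+1} = Q_n + (1/n)(R_n − Q_n), so no history of past rewards needs to be stored. Actions are chosen ϵ-greedily: with probability ϵ a random arm is pulled, otherwise the arm with the highest current estimate is pulled.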
begin
    import Pkg
    # activate a clean temporary environment for this notebook
    Pkg.activate(mktempdir())

    Pkg.add([
        Pkg.PackageSpec(name="Plots"),
        Pkg.PackageSpec(name="PlutoUI"),
    ])

    using Plots
    using PlutoUI
end
# Provides functions for the k-armed bandit example
module Bandits

# ϵ-greedy action selection: explore with probability ϵ, otherwise exploit
function epsilon_greedy(Q_a, epsilon)
    if rand() < epsilon
        action_index = rand(1:length(Q_a))
    else
        action_index = argmax(Q_a)
    end
    action_index
end

# Run one episode of `steps` pulls on a fresh set of k bandits
function run_episode(k=10, steps=1000, epsilon=0)
    Q_star = randn(k)        # true action values, q*(a) ~ N(0, 1)
    Q_a = zeros(k)           # action-value estimates
    N_a = zeros(Int, k)      # number of times each bandit was chosen
    rewards = zeros(steps)

    for i in 1:steps
        # select action
        a = epsilon_greedy(Q_a, epsilon)
        # get reward from bandit, ~ N(Q_star[a], 1)
        r = Q_star[a] + randn()
        N_a[a] += 1
        # incremental sample-average update
        Q_a[a] += (1/N_a[a]) * (r - Q_a[a])

        rewards[i] = r       # record the reward received at this step
    end

    rewards
end

# Runs several episodes with different bandits to measure the
# average reward of the ϵ-greedy algorithm
function run_series(episodes=2000, steps=1000, k=10, epsilon=0)
    rewards = zeros(steps)
    for j in 1:episodes
        rewards .+= run_episode(k, steps, epsilon)
    end
    rewards / episodes
end


# Like run_episode, but tracks how often the optimal action is selected
function run_episode_opt(k=10, steps=1000, epsilon=0)
    Q_star = randn(k)
    Q_a = zeros(k)
    N_a = zeros(Int, k)
    opt_choice_times = 0
    opt_choice = zeros(steps)

    for i in 1:steps
        # select action
        a = epsilon_greedy(Q_a, epsilon)
        # get random reward from bandit, ~ Normal(Q_star[a], 1)
        r = Q_star[a] + randn()
        N_a[a] += 1
        Q_a[a] += (1/N_a[a]) * (r - Q_a[a])

        # running fraction of steps on which the optimal action was chosen
        opt_choice_times += argmax(Q_star) == a
        opt_choice[i] = opt_choice_times / i
    end

    opt_choice
end

# Runs several episodes with different bandits to measure how often
# the ϵ-greedy algorithm selects the optimal action
function run_series_choice(episodes=2000, steps=1000, k=10, epsilon=0)
    opt_choice = zeros(steps)
    for j in 1:episodes
        opt_choice .+= run_episode_opt(k, steps, epsilon)
    end
    opt_choice / episodes
end

end

[Interactive controls: PlutoUI sliders labeled "You like to explore a little!" (ϵ), "Number of armed bandits" (k), "Steps =" and "Episodes =".]
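The export lost the exact slider definitions, so below is a minimal sketch of how the parameters could be bound with PlutoUI. The bound names (ϵ, k, steps, episodes) are the ones the plotting cell uses; the ranges and defaults are assumptions, and in Pluto each @bind goes in its own cell.

# Minimal reconstruction of the slider cells; ranges and defaults are assumed
@bind ϵ Slider(0:0.01:0.2, default=0.1, show_value=true)            # "You like to explore a little!"
@bind k Slider(2:20, default=10, show_value=true)                   # "Number of armed bandits"
@bind steps Slider(100:100:2000, default=1000, show_value=true)     # "Steps ="
@bind episodes Slider(100:100:5000, default=2000, show_value=true)  # "Episodes ="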
begin
    # Average reward and % of time the optimal action is chosen
    performance_e1 = Bandits.run_series(episodes, steps, k, ϵ)
    opt_choice = 100 * Bandits.run_series_choice(episodes, steps, k, ϵ)

    p1 = plot(1:steps, performance_e1;
        linewidth = 2,
        label = "Average reward",
        legend = :bottomright)
    title!("Average reward over $episodes episodes with ϵ=$ϵ")
    p2 = plot(1:steps, opt_choice;
        linewidth = 2,
        label = "% optimal action",
        legend = :bottomright)
    plot(p1, p2, layout = (2, 1))
end
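The book's Figure 2.2 overlays several values of ϵ. The cell below is a small sketch, not part of the original notebook, showing how the module could be reused to draw that comparison; it reuses the slider-bound episodes, steps and k.

begin
    # Sketch: compare several exploration rates, as in Figure 2.2 of the book
    ϵ_values = [0.0, 0.01, 0.1]
    p = plot(legend = :bottomright, xlabel = "Steps", ylabel = "Average reward")
    for ϵi in ϵ_values
        avg = Bandits.run_series(episodes, steps, k, ϵi)
        plot!(p, 1:steps, avg; linewidth = 2, label = "ϵ = $ϵi")
    end
    p
end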